#!pip install pandas-profiling
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import os
import pandas_profiling
# List every file in the working directory with its size in MB
# (quick check that the dataset CSV is present and how big it is).
# NOTE: the loop body lost its indentation in the notebook export; restored here.
for f in os.listdir():
    print(f.ljust(30) + "--" + str(round(os.path.getsize(f) / 1000000, 2)) + 'MB')
# --- Load the dataset and repair invalid (negative) Experience values ---
df=pd.read_csv('Bank_Personal_Loan_Modelling.csv')
df.shape
df.info()
df.head()
df.describe(include='all')
df.profile_report()
# Inspect rows where Experience is negative (presumably data-entry errors).
df[df['Experience']<0].describe()
df[df['Experience']<0].Experience.value_counts()
plt.figure(figsize=(16,8))
sns.heatmap(df.corr(), annot=True)
print('Experience and Age shows very high correlation, will try to replace negative value from same age grop in the fields')
print('Experience has relation with Education as well but we dont see much on heatmap.')
# Break down the negative-Experience rows by Age and Education to pick
# sensible replacement values from peers of the same age/education.
df[df['Experience']== -1].Age.value_counts()
df[df['Experience']== -1].Education.value_counts()
df[df['Experience']== -2].Age.value_counts()
df[df['Experience']== -3].Age.value_counts()
df[df['Age']== 23].Experience.value_counts()
df[df['Age']== 23].Education.value_counts()
df[df['Age']== 24].Experience.value_counts()
df[df['Age']== 24].Education.value_counts()
# NOTE(review): these two assignments overwrite Experience for ALL rows with
# Age 23/24, not only the negative ones — confirm that is intended.
df.loc[(df['Age']== 23),'Experience']=0
df.loc[(df['Age']== 24),'Experience']=0
df[df['Experience']== -2].Age.value_counts()
df[df['Experience']== -2]
# For each remaining (Age, Education) group, look at the distribution of
# valid Experience values and impute the group's median.
print(df[(df['Age']== 25) & (df['Education']== 3) & (df['Experience']> -1)].Experience.describe())
print('\nWe will replace the values with median of 0 for age 25, considering the age and eduction of 3')
df.loc[((df['Age']== 25) & (df['Education']== 3) & (df['Experience']< 0)),'Experience']=0
print(df[(df['Age']== 28) & (df['Education']== 3) & (df['Experience']> -1)].Experience.describe())
print('\nWe will replace the values with median of 0 for age 28, considering the age and eduction of 3')
df.loc[((df['Age']== 28) & (df['Education']== 3) & (df['Experience']== -2)),'Experience']=3
df[df['Experience']== -2].Age.count()
df[df['Experience']== -1].Age.value_counts()
df[df['Experience']== -1]
print(df[(df['Age']== 25) & (df['Education']==1) & (df['Experience']> -1)].Experience.describe())
print(df[(df['Age']== 25) & (df['Education']==2) & (df['Experience']> -1)].Experience.describe())
print('We will replace the values with 50% of 1 for age 25 and edu - 2,1')
print(df[(df['Age']== 26) & (df['Education']== 2) & (df['Experience']> -1)].Experience.describe())
print('We will replace the values with median of 1 for age 26 and edu - 2')
print(df[(df['Age']== 29) & (df['Education']== 3) & (df['Experience']> -1)].Experience.describe())
print('We will replace the values with median of 3 for age 29 and edu 3')
df.loc[((df['Age']== 29) & (df['Experience']< 0)),'Experience']=3
df.loc[((df['Age']== 26) & (df['Experience']< 0)),'Experience']=1
df.loc[((df['Age']== 25) & (df['Experience']< 0)),'Experience']=1
# Verify no negative Experience values remain, and check for missing data.
df[df['Experience']<0].Experience.count()
df.isna().sum()
df.isnull().sum()
# --- Pairwise views and univariate analysis of Age / Experience / Income ---
sns.pairplot(df,hue='Personal Loan',diag_kind='hist')
df_select=df.drop(['ID','ZIP Code','Securities Account','CD Account','Online','CreditCard'],axis=1)
#dropping few cols to get a better picture
sns.pairplot(df_select)
print("with first glance can clearly notice high correlation with age and experience, income and CCAvg, mortgage with Income")
# Cast the target to str so seaborn treats it as a categorical hue.
df_select=df_select.astype({"Personal Loan":str})
sns.pairplot(df_select,hue="Personal Loan")
plt.show()
print('''For Age/Experience, Loan distribution looks like evenly distributed,
For Income, shows high presence on loan acceptance for high income group
For Family, have comparativly higer presne of loan acceptance in family members with 3 or 4
For CCAvg, higer CCAvg group members have higher chances of acceptance of Loan
For Education, Loan acceptance grew with education level
For Mortgage, doenst provide much data from this view''')
sns.distplot(df['Age'])
print(df['Personal Loan'].value_counts())
print('Persons taking loan are: 9.6% ')
#ID has no relation with anything hence skipping it.
#Age
#What age group is taking loan
print(df[df['Personal Loan']==1].Age.describe())
print('''\nAge looks distributed across the people taking personal loan.
Majority of cutomers are 35-65 considering 25%.
''')
#distribution of the age group
sns.distplot(df[df['Personal Loan']==1].Age,bins=10,color='r',rug=True)
sns.distplot(df[df['Personal Loan']==0].Age,bins=10,color='g',rug=True)
plt.show()
#distribution of the age group
#have to make kde false else for the kde representaion its showing both graphs on same scaled level
sns.distplot(df[df['Personal Loan']==0]['Age'],bins=10,color='g',kde=False,rug=True);
sns.distplot(df[df['Personal Loan']==1]['Age'],bins=10,color='r',kde=False,rug=True);
plt.show();
print(''' Personal Loan customers looks distributed across the age group''')
#Experience
print(df[df['Personal Loan']==1].Experience.describe())
print('''\nAge and Experience are highly correlated and expecting same behaviour with Loan as well.
''')
sns.distplot(df[df['Personal Loan']==1].Experience,bins=5)
sns.distplot(df[df['Personal Loan']==0]['Experience'],bins=10,label='Loan 0',kde=False,rug=True);
sns.distplot(df[df['Personal Loan']==1]['Experience'],bins=10,label='Loan 1',kde=False,rug=True);
plt.legend();
plt.show();
print(''' Personal Loan customers looks distributed across the experience range as in age.''')
#Income
print(df[df['Personal Loan']==1].Income.describe())
print(df[df['Personal Loan']==0].Income.describe())
print('''\n Income for cust taken personal loan are ranging from 60-203. For others its ranging from 8-224.
This details suggests the personal loan is usually not accepted by income group below 60k
and considering the 25% value of 122K, there is higher chance of accepatnce of
Personal loan for income group above 122K.
''')
sns.distplot(df[df['Personal Loan']==0]['Income'],kde=False,label='Loan 0',rug=True);
sns.distplot(df[df['Personal Loan']==1]['Income'],kde=False,label='Loan 1',rug=True);
plt.legend();
plt.show();
print('''from this image we can assume the personal loan has more cahnce of acceptance towards higher income group.
From above details as well we can notice the distribution of customers taking loan is higher above 122K income.
The extream right and left of graph suggests these grousp are not so inclienced towards the loan.
There is a very high rate of conversion for income group at range from 150-200.
''')
#Family
# --- Univariate analysis of Family / CCAvg / Education / Mortgage vs loan ---
print(df[df['Personal Loan']==1].Family.describe())
print(df[df['Personal Loan']==0].Family.describe())
sns.countplot(data=df,x='Family',hue='Personal Loan')
plt.show();
print('''
Personal Loan customer looks distributed in all range of family but family memeber with 3 and 4 have higher presence compared to others.
''')
#CCAvg
print(df[df['Personal Loan']==1].CCAvg.describe())
print(df[df['Personal Loan']==0].CCAvg.describe())
print('''
Personal Loan cust are distributed but cust having higher CCAvg (>8.8) have higer acceptance towards personal Loan.
Cust not taking Loan have a max of 8.8 compared to 10 in cust taking Loan.
Cust having CCAvg ranging from 2.6-5.35 makes 50% of customers taking loan.
''')
sns.distplot(df[df['Personal Loan']==0]['CCAvg'],kde=False,label='Loan 0',rug=True);
sns.distplot(df[df['Personal Loan']==1]['CCAvg'],kde=False,label='Loan 1',rug=True);
plt.legend();
plt.show();
#Education
print(df[df['Personal Loan']==1].Education.describe())
print(df[df['Personal Loan']==0].Education.describe())
sns.boxplot(data=df,x='Education',y='Income')
print('Mean average seems slightly higher for Undergrads')
sns.boxplot(data=df,x='Education',y='Income',hue='Personal Loan')
print('Personal Loan is more choosen by education group 2 and 3')
sns.countplot(data=df,x='Education',hue='Personal Loan')
plt.show();
print('''
Personal Loan customer presence is more in education group 3 and 2 compared to 1.
There is higher chance of conversion rate if the cust is graduate/Advanced/Professional education level
''')
#Mortgage
print(df[df['Personal Loan']==1].Mortgage.describe())
print(df[df['Personal Loan']==0].Mortgage.describe())
plt.figure(figsize=(16,8))
sns.distplot(df[df['Personal Loan']==1]['Mortgage'],kde=False,label='Loan 1');
sns.distplot(df[df['Personal Loan']==0]['Mortgage'],kde=False,label='Loan 0');
plt.legend();
plt.show();
#Eliminating the mortgage 0 as its making the view very difficult to read
plt.figure(figsize=(16,8))
sns.distplot(df[(df['Personal Loan']==0) & (df['Mortgage']>0) ]['Mortgage'],kde=False,label='Loan 0',rug=True);
sns.distplot(df[(df['Personal Loan']==1) & (df['Mortgage']>0) ]['Mortgage'],kde=False,label='Loan 1',rug=True);
plt.legend();
plt.show();
print('''
Higher chance of conversion to Loan for cust having Mortgage above 280/300K
(from 50% in below table)
''')
print(df[(df['Personal Loan']==1) & (df['Mortgage']>0)].Mortgage.describe())
print(df[(df['Personal Loan']==0) & (df['Mortgage']>0)].Mortgage.describe())
print('''
Customers having no Mortage have very likelihood of taking PErsonal Loan.
For other groups of having varying range of mortages have mostly steady distribution.
Comparatively customers having mortgages having more than approx. 280K have higher chances
of accepting personal loan (may be because of liqiudity crunch due to higher mortage)
''')
#Securities Account
# --- Securities-Account vs loan acceptance; annotated count plot ---
# NOTE: the for-loop body below lost its indentation in the notebook export;
# restored here — every other token is unchanged.
print(df[df['Personal Loan']==1]['Securities Account'].describe())
print(df[df['Personal Loan']==0]['Securities Account'].describe())
print("No relation can be inferred")
sns.boxplot(data=df,x='Securities Account',y='Income',hue='Personal Loan')
print("Distribution looks even in both groups")
df['Securities Account'].value_counts()
df[df['Personal Loan']==1]['Securities Account'].value_counts()
df[df['Personal Loan']==0]['Securities Account'].value_counts()
# Hard-coded counts taken from the value_counts() output above.
print('Percentage of conversin for people having sec account - '+str((60/522)*100))
print('Percentage of conversin for people not having sec account - '+str((420/4478)*100))
sns.countplot(data=df,x='Securities Account',hue='Personal Loan');
ax=sns.countplot(data=df,x='Securities Account',hue='Personal Loan')
total = float(len(df))
# Write each bar's share of the full dataset above the bar.
for p in ax.patches:
    height = p.get_height()
    ax.text(p.get_x()+p.get_width()/2.,
            height + 3,
            '{:1.2f}'.format(height/total),
            ha="center")
#over all percentage on top of bars #code referenced from https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn
plt.show();
print('''
Customers not having securities loan have higher conversion but the number of
cust not having Sec Account is also higher.
Not able to gather much relation from the distribution on securities account''')
#code referenced from https://stackoverflow.com/questions/31749448/how-to-add-percentages-on-top-of-bars-in-seaborn
def with_hue(plot, feature, Number_of_categories, hue_categories):
    """Annotate a hued seaborn countplot with per-category percentages.

    plot                 -- the Axes returned by sns.countplot
    feature              -- the Series plotted on the x axis
    Number_of_categories -- number of distinct x categories
    hue_categories       -- number of distinct hue values

    Draws '<pct>%' above each bar, where pct is the bar's height divided
    by the total count of its x category, then shows the figure.
    """
    # Bar heights and patches appear grouped by hue, then by category.
    a = [p.get_height() for p in plot.patches]
    patch = [p for p in plot.patches]
    for i in range(Number_of_categories):
        # NOTE(review): value_counts() orders by frequency, not by category
        # position — this matches the bars only when the two orders agree.
        total = feature.value_counts().values[i]
        for j in range(hue_categories):
            percentage = '{:.1f}%'.format(100 * a[(j*Number_of_categories + i)]/total)
            x = patch[(j*Number_of_categories + i)].get_x() + patch[(j*Number_of_categories + i)].get_width() / 2 - 0.15
            y = patch[(j*Number_of_categories + i)].get_y() + patch[(j*Number_of_categories + i)].get_height()
            # Annotate on the axes passed in — the original used the global
            # `ax`, silently depending on notebook execution order.
            plot.annotate(percentage, (x, y), size = 12)
    plt.show()
# --- Crosstabs for binary features, target distribution, train/test split,
# --- and the first LogisticRegression model ---
ax=sns.countplot(data=df,x='Securities Account',hue='Personal Loan')
with_hue(ax,df['Securities Account'],2,2)
pd.crosstab(df['Securities Account'],df['Personal Loan'])
print('Statiscally people having securietes account have more chances of conversion.')
#CD Account
pd.crosstab(df['CD Account'],df['Personal Loan'])
ax=sns.countplot(data=df,x='CD Account',hue='Personal Loan')
with_hue(ax,df['CD Account'],2,2)
# Hard-coded counts below come from the crosstab output above.
print('Percentage of conversin for people having CD account - '+str((140/(140+162))*100))
print('Percentage of conversin for people not having CD account - '+str((340/(4358+340))*100))
print('Visually & Statiscally people having CD account have higher chances of conversion for Loan.')
# Online
pd.crosstab(df['Online'],df['Personal Loan'])
ax=sns.countplot(data=df,x='Online',hue='Personal Loan')
#with_hue(ax,df['Online'],2,2)
print('Percentage of conversin for people having online account - '+str((291/(2693+291))*100))
print('Percentage of conversin for people not having online account - '+str((189/(1827+189))*100))
print("Online doesnt show a impact on the loan acceptance")
#Credit Card
pd.crosstab(df['CreditCard'],df['Personal Loan'])
ax=sns.countplot(data=df,x='CreditCard',hue='Personal Loan')
with_hue(ax,df['CreditCard'],2,2)
print('Percentage of conversin for people having CC - '+str((143/(143+1327))*100))
print('Percentage of conversin for people not having CC - '+str((337/(337+3193))*100))
print("No impact of having CC")
#Getting Target column
print(pd.DataFrame(df['Personal Loan']).info())
print()
print(df['Personal Loan'].value_counts())
print('''\n
Total 5000 records out of which only 480 customers have accepted the Loan.
''')
print("\nOut of total " + str(df['Personal Loan'].count()) + ", percentage customers opted for loan are: "+ str(df['Personal Loan'].value_counts(1)[1]*100))
df['Personal Loan'].value_counts(1)
print(df['Personal Loan'].value_counts())
print("\nOut of total " + str(df['Personal Loan'].count()) + ", percentage customers opted for loan are: "+ str(df['Personal Loan'].value_counts(1)[1]*100))
sns.countplot(data=df,x='Personal Loan')
plt.show()
#graphs credit - Referenced from https://www.datacamp.com/community/tutorials/categorical-data
# Pie chart of the (imbalanced) target distribution.
labels = df['Personal Loan'].astype('category').cat.categories.tolist()
counts = df['Personal Loan'].value_counts()
sizes = [counts[var_cat] for var_cat in labels]
fig1, ax1 = plt.subplots()
ax1.pie(sizes, labels=labels, autopct='%1.1f%%', shadow=True) #autopct is show the % on plot
ax1.axis('equal')
plt.show()
print('''There is a huge gap between customers accepting the Loan and total customers,
class distribution is severely skewed.
This may results in models that have poor predictive performance,
specifically for the minority class.''')
#https://www.datacamp.com/community/tutorials/diving-deep-imbalanced-data?utm_source=adwords_ppc&utm_campaignid=1455363063&utm_adgroupid=65083631748&utm_device=c&utm_keyword=&utm_matchtype=b&utm_network=g&utm_adpostion=&utm_creative=332602034358&utm_targetid=aud-392016246653:dsa-429603003980&utm_loc_interest_ms=&utm_loc_physical_ms=9061996&gclid=Cj0KCQjwvvj5BRDkARIsAGD9vlKQoq5lRfZhmcZAwNSvWsmJM1EIepUab4d5F2WH24kIiOE2Gt7oA3QaApJ2EALw_wcB
plt.figure(figsize=(16,8))
sns.heatmap(df.corr(), annot=True)
df.corr()['Personal Loan'].sort_values(ascending=False)
print('''
As analyzed above, Income, CCAvg has highest linear correlation with Loan.
Age and Experience are highly correlated but Experience is more negatively correlated with Loan then Age.
ZIP Code has least correlation with target along with CreditCard and Online
''')
#Split the data into training and test set in the ratio of 70:30 respectively
#Testing with majority data with no changes, except ID and Age (Age and Experience are highly correlated and dropping one will reduce the complexity)
X=df.drop(['ID','Age','Personal Loan'],axis=1)
y=df['Personal Loan']
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)
X_train.info()
print("Train and Test were splitted into 70:30")
y_train.value_counts(1)
y_test.value_counts(1)
print("Train and Test data sets have similar distribution of target variable")
#Use different classification models (Logistic, K-NN and Naïve Bayes) to predict the likelihood of a customer buying personal loans
#Print the confusion matrix for all the above models
#logistic regression
from sklearn.linear_model import LogisticRegression
logRegModel=LogisticRegression()
logRegModel.fit(X_train,y_train)
y_predict=logRegModel.predict(X_test)
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,f1_score,precision_score,roc_curve,log_loss,auc
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
# Start the model-comparison table; later models are appended to it.
modelComp=pd.DataFrame({'Model':['Logistic Regression - 0.5'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]})
#Logistic Regression with default thresold - Recall - 33% and Accuracy - 91.2%
#referenced from training materials
from sklearn import metrics
def draw_cm( actual, predicted ):
    """Plot a 2x2 confusion matrix (classes 0/1) as an annotated heatmap.

    actual    -- true labels
    predicted -- predicted labels
    """
    # `labels` must be passed by keyword: positional use beyond y_pred is
    # removed in modern scikit-learn.
    cm = metrics.confusion_matrix( actual, predicted, labels=[0,1] )
    sns.heatmap(cm, annot=True, fmt='.0f', xticklabels = ["Loan 0", "Loan 1"] , yticklabels = ["Loan 0", "Loan 1"] )
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.show()
draw_cm(y_test, y_predict)
from sklearn.preprocessing import binarize
#changing the threshold to 0.3
# NOTE(review): passing the threshold positionally to binarize is removed in
# newer scikit-learn — would need binarize(..., threshold=0.3) there.
y_pred_class = binarize([logRegModel.predict_proba(X_test)[:, 1]], 0.3)[0]
print('Accuracy score:',accuracy_score(y_test,y_pred_class))
print('confuion matrix:\n',confusion_matrix(y_test,y_pred_class))
print('Recall Score: ',recall_score(y_test, y_pred_class))
print('Precission Score: ',precision_score(y_test, y_pred_class))
print('F1 Score: ',f1_score(y_test, y_pred_class))
draw_cm(y_test, y_pred_class)
modelComp=modelComp.append(pd.DataFrame({'Model':['Logistic Regression - 0.3'],'Accuracy':[accuracy_score(y_test,y_pred_class)*100],'Precission':[precision_score(y_test, y_pred_class)*100],'Recall':[recall_score(y_test, y_pred_class)*100]}))
#Logistic Regression with threshold decrease to .3 Recall increased to 53.7% and Accuracy - 89.9%
#referenced from training materials
# ROC curve for the sklearn LogisticRegression model.
y_pred_proba = logRegModel.predict_proba(X_test)[:, 1]
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.plot(fpr, tpr, color='coral', label='ROC curve (area = %0.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - specificity)', fontsize=14)
plt.ylabel('True Positive Rate (recall)', fontsize=14)
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
#Stats Model Logit
#!pip install statsmodels
import statsmodels.api as sm
# Fit a statsmodels Logit on the same split (intercept added explicitly).
logit = sm.Logit( y_train, sm.add_constant( X_train ) )
lg = logit.fit()
#lg.summary2()
y_predict=pd.DataFrame(lg.predict( sm.add_constant( X_test ) ))
#print(y_predict[0:5])
#y_predict.info()
#referenced from training materials
def get_predictions( y_test, X_test,model ):
    """Return a DataFrame pairing actual labels with the statsmodels model's
    predicted probabilities for X_test (a constant column is added to match
    how the model was fitted).

    y_test -- true labels
    X_test -- feature matrix
    model  -- fitted statsmodels results object with a .predict method
    """
    y_pred_df = pd.DataFrame( { 'actual': y_test,
                                "predicted_prob": model.predict( sm.add_constant( X_test ) ) } )
    return y_pred_df
# --- Evaluate the Logit model at threshold 0.3, then KNN and Naive Bayes,
# --- accumulating results into modelComp and overlaying ROC curves ---
y_pred_df = get_predictions(y_test,X_test, lg )
y_pred_df.head()
# Binarize the Logit probabilities at a 0.3 threshold.
y_pred_df['predicted'] = y_pred_df.predicted_prob.map( lambda x: 1 if x > 0.3 else 0)
y_pred_df.head()
y_predict=y_pred_df['predicted']
#y_predict = y_predict.apply( lambda x: 1 if x > 0.6 else 0)
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
#Logit Recall - 72.5% and Accuracy - 94.07%
draw_cm(y_test, y_predict)
modelComp=modelComp.append(pd.DataFrame({'Model':['Logit -StatsModel - 0.3'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]}))
y_pred_proba = y_pred_df.predicted_prob
[fpr1, tpr1, thr1] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr1, tpr1, color='c', label='ROC curve Logit (area = %0.2f)' % auc(fpr1, tpr1))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
#KNN
from sklearn.neighbors import KNeighborsClassifier
KnnModel = KNeighborsClassifier(n_neighbors=3)
KnnModel.fit(X_train,y_train)
y_predict=KnnModel.predict(X_test)
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
modelComp=modelComp.append(pd.DataFrame({'Model':['KNN - 3 Neigbours'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]}))
#KNN recall - 32.9% Accuracy - 90.5%
y_pred_proba = KnnModel.predict_proba(X_test)[:, 1]
[fpr2, tpr2, thr2] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr1, tpr1, color='c', label='ROC curve Logit (area = %0.2f)' % auc(fpr1, tpr1))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
#Naive bayes - GaussianNB
from sklearn.naive_bayes import GaussianNB,BernoulliNB
NBGauModel = GaussianNB()
NBGauModel.fit(X_train,y_train)
y_predict=NBGauModel.predict(X_test)
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
modelComp=modelComp.append(pd.DataFrame({'Model':['Naive Bayes - Gaussian'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]}))
#NB Gaussian - Recall - 55.7% Accuracy - 88.13%
y_pred_proba = NBGauModel.predict_proba(X_test)[:, 1]
[fpr3, tpr3, thr3] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr1, tpr1, color='c', label='ROC curve Logit (area = %0.2f)' % auc(fpr1, tpr1))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot(fpr3, tpr3, color='b', label='ROC curve NaiveBayes (area = %0.2f)' % auc(fpr3, tpr3))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
# BernoulliNB
NBBernoulliModel = BernoulliNB()
NBBernoulliModel.fit(X_train,y_train)
y_predict=NBBernoulliModel.predict(X_test)
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
modelComp=modelComp.append(pd.DataFrame({'Model':['Naive Bayes - Bernoulli'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]}))
#NB Bernoulli - Recall - 11.4% Accuracy - 88.7%
y_pred_proba = NBBernoulliModel.predict_proba(X_test)[:, 1]
[fpr4, tpr4, thr4] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr1, tpr1, color='c', label='ROC curve Logit (area = %0.2f)' % auc(fpr1, tpr1))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot(fpr3, tpr3, color='b', label='ROC curve NaiveBayes (area = %0.2f)' % auc(fpr3, tpr3))
plt.plot(fpr4, tpr4, color='r', label='ROC curve Bernoulli (area = %0.2f)' % auc(fpr4, tpr4))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
#Give your reasoning on which is the best model in this case and why it performs better
# NOTE(review): this ROC section duplicates the one immediately above.
y_pred_proba = NBBernoulliModel.predict_proba(X_test)[:, 1]
[fpr4, tpr4, thr4] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr1, tpr1, color='c', label='ROC curve Logit (area = %0.2f)' % auc(fpr1, tpr1))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot(fpr3, tpr3, color='b', label='ROC curve NaiveBayes (area = %0.2f)' % auc(fpr3, tpr3))
plt.plot(fpr4, tpr4, color='r', label='ROC curve Bernoulli (area = %0.2f)' % auc(fpr4, tpr4))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
modelComp
# Additional theory details on NB and Logistic Regression referenced from - https://medium.com/@sangha_deb/naive-bayes-vs-logistic-regression-a319b07a5d4c#:~:text=Naive%20Bayes%20also%20assumes%20that,will%20be%20a%20better%20classifier.
#modelComp.drop(modelComp.index, inplace=True)
# Next Approach to reduce few less significant columns from data set and captuer performance on model (reduce complexity)
# --- Re-run the same models on a reduced feature set (drop low-correlation
# --- columns) and collect results into modelComp2 ---
#X=df.drop(['ID','Age','Personal Loan'],axis=1)
X=df.drop(['ID','Personal Loan','Age','CreditCard','Online','ZIP Code'],axis=1)
#X=df.drop(['ID','Personal Loan','Age','Online','ZIP Code'],axis=1)
y=df['Personal Loan']
from sklearn.model_selection import train_test_split
# Same split parameters as before so results stay comparable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=5)
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logRegModel=LogisticRegression(max_iter=1000)
logRegModel.fit(X_train,y_train)
y_predict=logRegModel.predict(X_test)
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,f1_score,precision_score,roc_curve,log_loss,auc
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
modelComp2=pd.DataFrame({'Model':['Logistic Regression - 0.5'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]})
draw_cm(y_test, y_predict)
#changing the threshold to 0.3
y_pred_class = binarize([logRegModel.predict_proba(X_test)[:, 1]], 0.3)[0]
print('Accuracy score:',accuracy_score(y_test,y_pred_class))
print('confuion matrix:\n',confusion_matrix(y_test,y_pred_class))
print('Recall Score: ',recall_score(y_test, y_pred_class))
print('Precission Score: ',precision_score(y_test, y_pred_class))
print('F1 Score: ',f1_score(y_test, y_pred_class))
draw_cm(y_test, y_pred_class)
modelComp2=modelComp2.append(pd.DataFrame({'Model':['Logistic Regression - 0.3'],'Accuracy':[accuracy_score(y_test,y_pred_class)*100],'Precission':[precision_score(y_test, y_pred_class)*100],'Recall':[recall_score(y_test, y_pred_class)*100]}))
y_pred_proba = logRegModel.predict_proba(X_test)[:, 1]
[fpr, tpr, thr] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.plot(fpr, tpr, color='coral', label='ROC curve (area = %0.3f)' % auc(fpr, tpr))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - specificity)', fontsize=14)
plt.ylabel('True Positive Rate (recall)', fontsize=14)
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
#KNN
from sklearn.neighbors import KNeighborsClassifier
KnnModel = KNeighborsClassifier(n_neighbors=3)
KnnModel.fit(X_train,y_train)
y_predict=KnnModel.predict(X_test)
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
modelComp2=modelComp2.append(pd.DataFrame({'Model':['KNN - 3 Neigbours'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]}))
y_pred_proba = KnnModel.predict_proba(X_test)[:, 1]
[fpr2, tpr2, thr2] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
# NB - Gaussian
from sklearn.naive_bayes import GaussianNB,BernoulliNB
NBGauModel = GaussianNB()
NBGauModel.fit(X_train,y_train)
y_predict=NBGauModel.predict(X_test)
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
modelComp2=modelComp2.append(pd.DataFrame({'Model':['Naive Bayes - Gaussian'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]}))
y_pred_proba = NBGauModel.predict_proba(X_test)[:, 1]
[fpr3, tpr3, thr3] = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot(fpr3, tpr3, color='b', label='ROC curve NaiveBayes (area = %0.2f)' % auc(fpr3, tpr3))
plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiverrating characteristic example')
plt.legend(loc="lower right")
plt.show()
modelComp
modelComp2
# Additional theory details on NB and Logistic Regression referenced from - https://medium.com/@sangha_deb/naive-bayes-vs-logistic-regression-a319b07a5d4c#:~:text=Naive%20Bayes%20also%20assumes%20that,will%20be%20a%20better%20classifier.
# Next - scaling the data - as values in the columns are on very diff scale then each other
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# Fit the scaler on the training data only.
X_train_scaled = scaler.fit_transform(X_train)
# Re-use the training-set min/max on the test set. The original called
# fit_transform(X_test), which refits on the test data — that leaks test-set
# statistics and scales train and test inconsistently.
X_test_scaled = scaler.transform(X_test)
#Logistic Regression
from sklearn.linear_model import LogisticRegression
logRegModel=LogisticRegression()
logRegModel.fit(X_train_scaled,y_train)
y_predict=logRegModel.predict(X_test_scaled)
from sklearn.metrics import accuracy_score,confusion_matrix,recall_score,f1_score,precision_score,roc_curve,log_loss,auc
print('Accuracy score:',accuracy_score(y_test,y_predict))
print('confuion matrix:\n',confusion_matrix(y_test,y_predict))
print('Recall Score: ',recall_score(y_test, y_predict))
print('Precission Score: ',precision_score(y_test, y_predict))
print('F1 Score: ',f1_score(y_test, y_predict))
modelComp3=pd.DataFrame({'Model':['Logistic Regression - 0.5'],'Accuracy':[accuracy_score(y_test,y_predict)*100],'Precission':[precision_score(y_test, y_predict)*100],'Recall':[recall_score(y_test, y_predict)*100]})
draw_cm(y_test, y_predict)
# Changing the decision threshold to 0.3 to trade precision for recall.
# FIX: binarize's threshold must be passed by keyword in modern scikit-learn
# (positional threshold was deprecated and then removed).
y_pred_class = binarize([logRegModel.predict_proba(X_test_scaled)[:, 1]], threshold=0.3)[0]
print('Accuracy score:', accuracy_score(y_test, y_pred_class))
print('confusion matrix:\n', confusion_matrix(y_test, y_pred_class))  # FIX: "confuion" typo
print('Recall Score: ', recall_score(y_test, y_pred_class))
print('Precission Score: ', precision_score(y_test, y_pred_class))
print('F1 Score: ', f1_score(y_test, y_pred_class))
draw_cm(y_test, y_pred_class)
# FIX: DataFrame.append() removed in pandas 2.0 — use pd.concat.
modelComp3 = pd.concat([
    modelComp3,
    pd.DataFrame({'Model': ['Logistic Regression - 0.3'],
                  'Accuracy': [accuracy_score(y_test, y_pred_class) * 100],
                  'Precission': [precision_score(y_test, y_pred_class) * 100],
                  'Recall': [recall_score(y_test, y_pred_class) * 100]})
])
# ROC curve for the scaled-data logistic regression model.
y_pred_proba = logRegModel.predict_proba(X_test_scaled)[:, 1]
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)
roc_area = auc(fpr, tpr)
plt.figure()
plt.plot(fpr, tpr, color='coral', label='ROC curve (area = %0.3f)' % roc_area)
plt.plot([0, 1], [0, 1], 'k--')  # random-chance reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate (1 - specificity)', fontsize=14)
plt.ylabel('True Positive Rate (recall)', fontsize=14)
plt.title('Receiver operating characteristic (ROC) curve')
plt.legend(loc="lower right")
plt.show()
# KNN on the scaled features (distance-based, so scaling matters most here).
from sklearn.neighbors import KNeighborsClassifier
KnnModel = KNeighborsClassifier(n_neighbors=3)
KnnModel.fit(X_train_scaled, y_train)
y_predict = KnnModel.predict(X_test_scaled)
print('Accuracy score:', accuracy_score(y_test, y_predict))
print('confusion matrix:\n', confusion_matrix(y_test, y_predict))  # FIX: "confuion" typo
print('Recall Score: ', recall_score(y_test, y_predict))
print('Precission Score: ', precision_score(y_test, y_predict))
print('F1 Score: ', f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
# FIX: DataFrame.append() removed in pandas 2.0 — use pd.concat.
modelComp3 = pd.concat([
    modelComp3,
    pd.DataFrame({'Model': ['KNN - 3 Neigbours'],
                  'Accuracy': [accuracy_score(y_test, y_predict) * 100],
                  'Precission': [precision_score(y_test, y_predict) * 100],
                  'Recall': [recall_score(y_test, y_predict) * 100]})
])
# Overlay LogReg and KNN ROC curves (scaled data).
y_pred_proba = KnnModel.predict_proba(X_test_scaled)[:, 1]
fpr2, tpr2, thr2 = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot([0, 1], [0, 1], 'k--')  # random-chance reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')  # FIX: title typo ("Receiverrating")
plt.legend(loc="lower right")
plt.show()
# Gaussian Naive Bayes on the scaled features.
from sklearn.naive_bayes import GaussianNB, BernoulliNB
NBGauModel = GaussianNB()
NBGauModel.fit(X_train_scaled, y_train)
y_predict = NBGauModel.predict(X_test_scaled)
print('Accuracy score:', accuracy_score(y_test, y_predict))
print('confusion matrix:\n', confusion_matrix(y_test, y_predict))  # FIX: "confuion" typo
print('Recall Score: ', recall_score(y_test, y_predict))
print('Precission Score: ', precision_score(y_test, y_predict))
print('F1 Score: ', f1_score(y_test, y_predict))
draw_cm(y_test, y_predict)
# FIX: DataFrame.append() removed in pandas 2.0 — use pd.concat.
modelComp3 = pd.concat([
    modelComp3,
    pd.DataFrame({'Model': ['Naive Bayes - Gaussian'],
                  'Accuracy': [accuracy_score(y_test, y_predict) * 100],
                  'Precission': [precision_score(y_test, y_predict) * 100],
                  'Recall': [recall_score(y_test, y_predict) * 100]})
])
# Overlay ROC curves for all three scaled-data models.
y_pred_proba = NBGauModel.predict_proba(X_test_scaled)[:, 1]
fpr3, tpr3, thr3 = roc_curve(y_test, y_pred_proba)
plt.figure()
plt.clf()
plt.plot(fpr, tpr, color='coral', label='ROC curve LogReg (area = %0.2f)' % auc(fpr, tpr))
plt.plot(fpr2, tpr2, color='g', label='ROC curve KNN (area = %0.2f)' % auc(fpr2, tpr2))
plt.plot(fpr3, tpr3, color='b', label='ROC curve NaiveBayes (area = %0.2f)' % auc(fpr3, tpr3))
plt.plot([0, 1], [0, 1], 'k--')  # random-chance reference line
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.0])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')  # FIX: title typo ("Receiverrating")
plt.legend(loc="lower right")
plt.show()
# Model comparison table: default (unscaled, all features).
modelComp
# Model comparison table: after removing a few columns to reduce complexity.
modelComp2
# Model comparison table: models trained on MinMax-scaled data.
modelComp3
# KNN theory referenced from - https://medium.com/analytics-vidhya/why-is-scaling-required-in-knn-and-k-means-8129e4d88ed7
#End OF File